Prototyping text source lookup via various API's and word cloud rendering with added features. Will eventually be a webapp most likely.
NYTReader - API reader and search/plot/pos functions
TwitterReader - API reader and search/plot/pos functions
WordArt - Image handling and word clouds
import requests
import json
import numpy as np
import math
import time
import pandas as pd
import nltk
from matplotlib import pyplot as plt
import os
from IPython.display import display
import matplotlib
# Bump the default font size for every matplotlib figure in this notebook.
matplotlib.rcParams.update({'font.size': 14})
from PIL import Image, ImageOps, ImageEnhance
from wordcloud import WordCloud, STOPWORDS
from requests_oauthlib import OAuth1
# IPython magic: execute the helper scripts so their definitions are in scope.
# Presumably words_api.py provides the API readers (NYTReader, TwitterReader)
# and words_visualization.py provides WordArt — confirm against those files.
%run words_api.py
%run words_visualization.py
%%html
<!-- Inject a jQuery toggle that hides/shows all notebook code input cells. -->
<script>
// Tracks whether code cells are currently visible.
code_show=true;
// Hide or show every input cell, then invert the flag.
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
// Hide the code by default once the document is ready.
$( document ).ready(code_toggle);
</script>
<p>To toggle on/off display of the raw code for this notebook, <a href="javascript:code_toggle()">click here</a>.</p>
# Load API credentials from the untracked private_data.json file.
# Use a context manager so the file handle is closed deterministically —
# the original json.loads(open(...).read()) left the handle dangling.
with open('private_data.json') as f:
    private_data = json.load(f)
nyt_key = private_data['nytimes-api-key']
twitter_auth_dict = private_data['twitter-auth']
# SF Bay Area: fetch NYT articles whose headlines mention the region.
reader_sf = NYTReader(nyt_key)
sf_query = {
    'search': 'headline:("San Francisco", "Oakland", "Silicon Valley")',
    'begin_date': '20010101',
    'article_limit': 200,
    'verbose': True,
}
reader_sf.get_search_term(**sf_query)
This example only plots proper nouns. The base-class dict WordsAPI.pos contains other part-of-speech tag sets that can be used instead.
# Chart the 50 most frequent proper nouns for each available text source.
proper_noun_tags = reader_sf.pos['include']
for src in reader_sf.text_sources:
    frame = reader_sf.df_from_text([src])
    reader_sf.plot_top_words(df=frame, n=50, title=src,
                             pos_list=proper_noun_tags,
                             exclude=False, print_tuple=False)
reader_sf.get_single_entity(1)
reader_sf.search('Media')
This method will return a list of words of the specified parts of speech
# Pull every word (no POS filter) from the SF headlines and snippets.
text_words = reader_sf.get_words(source_list=['headlines', 'snippets'], pos_list=None, exclude=False)
total_count = len(text_words)
unique_count = len(set(text_words))
print('%d total words, %d unique words' % (total_count, unique_count))

# Collect the raw text and render it onto a Bay Area map mask.
text_list = reader_sf.build_list_from_sources(source_list=['headlines', 'snippets'])
wa = WordArt()
image = wa.create_mask_image('source images/Location_Map_San_Francisco_Bay_Area_edited.gif')
wa.make_wordcloud(
    text=text_list,
    image=image,
    filename="final images/sf_bay.png",
    colormap='gist_earth',
    background='lightblue',
)
# Build the OAuth1 handle from the stored Twitter credentials.
creds = twitter_auth_dict
auth = OAuth1(client_key=creds['client_key'],
              client_secret=creds['client_secret'],
              resource_owner_key=creds['resource_owner_key'],
              resource_owner_secret=creds['resource_owner_secret'])

# Keyword search over recent tweets mentioning Oakland.
reader_twitter = TwitterReader(auth)
reader_twitter.get_search_term(search='Oakland', tweet_limit=200, verbose=True, recent=True)
Note: this doesn't always work reliably; consider using the get_recent_tweets() method instead.
# Search by geo radius instead of a keyword — presumably 'lat,long,radius'
# centered on the SF Bay; confirm against TwitterReader.get_search_term.
reader_twitter.get_search_term(geo='37.817941,-122.352714,20mi', tweet_limit=200, verbose=True)
Note: these take a while to load, so the limits should be set lower.
# Stream recent tweets from a coordinate box — presumably
# 'west,south,east,north' lon/lat covering the Bay Area; confirm against
# TwitterReader.get_recent_tweets. Low tweet_limit because this is slow.
reader_twitter_rt = TwitterReader(auth)
reader_twitter_rt.get_recent_tweets(location_box='-122.75,36.8,-121.75,37.8', tweet_limit=10, verbose=True)
This example only plots proper nouns. The base-class dict WordsAPI.pos contains other part-of-speech tag sets that can be used instead.
# For each tweet text source, chart the 50 most frequent proper nouns.
include_tags = reader_twitter.pos['include']
for src in reader_twitter.text_sources:
    reader_twitter.plot_top_words(df=reader_twitter.df_from_text([src]),
                                  n=50, title=src, pos_list=include_tags,
                                  exclude=False, print_tuple=False)
reader_twitter.get_single_entity(1)
reader_twitter.search('Warriors')
This method will return a list of words of the specified parts of speech
# Extract every word from the collected tweets (no POS filtering).
text_words = reader_twitter.get_words(source_list=['tweets'], pos_list=None, exclude=False)
n_total = len(text_words)
n_unique = len(set(text_words))
print('%d total words, %d unique words' % (n_total, n_unique))

# Gather the raw tweet text and render it on the Bay Area map mask,
# dropping Twitter boilerplate tokens on top of the default stopwords.
text_list = reader_twitter.build_list_from_sources(source_list=['tweets'])
wa = WordArt()
image = wa.create_mask_image('source images/Location_Map_San_Francisco_Bay_Area_edited.gif')
wa.make_wordcloud(
    text=text_list,
    image=image,
    filename="final images/sf_bay_twitter.png",
    colormap='gist_earth',
    background='lightblue',
    additional_stopwords=['https', 'co', 'amp', 'RT'],
    reset_stopwords=False,
)
# California: NYT headlines mentioning the state.
reader_ca = NYTReader(nyt_key)
reader_ca.get_search_term(search='headline:("California")', article_limit=200, verbose=True)

# Render onto a California silhouette (mask image has transparency).
text_list = reader_ca.build_list_from_sources(source_list=['headlines', 'snippets'])
wa = WordArt()
image = wa.create_mask_image('source images/kalifornien_schwarz.png', transparency=True)
wa.make_wordcloud(
    text=text_list,
    image=image,
    filename="final images/ca.png",
    colormap='gist_earth',
    background='gold',
)
# Renewable energy: NYT headlines since 2001 about solar/wind/renewables.
reader_green = NYTReader(nyt_key)
green_query = {
    'search': 'headline:("Solar", "Wind", "Renewable")',
    'begin_date': '20010101',
    'article_limit': 200,
    'verbose': True,
}
reader_green.get_search_term(**green_query)

# Render onto a renewable-energy icon (mask image has transparency).
text_list = reader_green.build_list_from_sources(source_list=['headlines', 'snippets'])
wa = WordArt()
image = wa.create_mask_image('source images/Renewable-Energy-Consultants.png', transparency=True)
wa.make_wordcloud(
    text=text_list,
    image=image,
    filename="final images/green.png",
    colormap='gist_earth',
    background='forestgreen',
)
# Manhattan Project: NYT headlines rendered onto a rotated bomb silhouette.
reader_bomb = NYTReader(nyt_key)
reader_bomb.get_search_term(search='headline:("Manhattan Project")',
                            article_limit=100,
                            verbose=True)

text_list = reader_bomb.build_list_from_sources(source_list=['headlines', 'snippets'])
wa = WordArt()
image = wa.create_mask_image('source images/ABomb.png', transparency=True, rotate=270)
wa.make_wordcloud(
    text=text_list,
    image=image,
    filename="final images/bomb.png",
    colormap='inferno',
    background='darkred',
)
# Cape Cod: NYT headlines back to 1901, on a Cape Cod map mask.
reader_cc = NYTReader(nyt_key)
cc_query = {
    'search': 'headline:("Cape Cod")',
    'begin_date': '19010101',
    'article_limit': 200,
    'verbose': True,
}
reader_cc.get_search_term(**cc_query)

text_list = reader_cc.build_list_from_sources(source_list=['headlines', 'snippets'])
wa = WordArt()
image = wa.create_mask_image('source images/capecod.PNG')
wa.make_wordcloud(
    text=text_list,
    image=image,
    filename="final images/capecod.png",
    colormap='gist_earth',
    background='lightblue',
)
# Michigan utilities: tweets about the major utilities within a 300-mile
# radius of mid-Michigan.
reader_mi = TwitterReader(auth)
reader_mi.get_search_term(search=['Consumers Energy', 'ConsumersEnergy', 'CE', 'DTE'],
                          geo='43.900689,-84.782134,300mi',
                          tweet_limit=300,
                          verbose=True)

# Render onto a Michigan outline, dropping Twitter boilerplate tokens.
text_list = reader_mi.build_list_from_sources(source_list=['tweets'])
wa = WordArt()
image = wa.create_mask_image('source images/Michigan.PNG')
wa.make_wordcloud(
    text=text_list,
    image=image,
    filename="final images/utilities.png",
    colormap='gist_earth',
    background='lightblue',
    additional_stopwords=['https', 'co', 'amp', 'RT'],
)
# Show the class documentation and the instance data attributes for each
# reader and the WordArt helper. Iterating a dict yields its keys directly,
# so the original [key for key in d.keys()] comprehension was redundant.
help(reader_sf)
print('Data: %s' % ', '.join(reader_sf.__dict__))
help(reader_twitter)
print('Data: %s' % ', '.join(reader_twitter.__dict__))
help(wa)